/*******************************************************************************
Last update: July 10, 2023 
********************************************************************************/

//Session setup: clear the Results window and memory, then move to the project root.
cls
clear all

//Paths use forward slashes: Stata accepts "/" as the directory separator on
//Windows, macOS and Linux, whereas "\" only works on Windows.
local dir "~/Dropbox/Working Papers/Distinction Effect"
cd "`dir'"

//Run the helper do-file quietly (path is relative to the project root set above)
qui do "Does/Data Processing/0. Cleaning Programs.do"

/********************************************************************************/
//Load the SPADIES 2016 file (forward slashes keep the path portable across OSes)
use "Data/Originals/SPADIES/Spadies 2016.dta", clear

//Keep rows flagged as graduates whose period equals their first-semester period
keep if e_graduado == 1 & prim_sem == periodo

//College Graduates 2004 to 2008
//inrange() is false for missing grado_per, same as the two-sided comparison
keep if inrange(grado_per, 20041, 20082)

//Clean National ID
//Rows with implausible document numbers or suspicious duplication are marked
//with out_sample = 1 and dropped together at the end of this section.

//String copy of the numeric ID, used for the digit-pattern checks below
tostring doc_num, gen(docnum) format(%30.0g)

cap drop out_sample
gen out_sample = .

//All types of documents
//Check #obs across time before dropping them
//NOTE: a missing doc_num compares as larger than any number, so it is flagged here as well
replace out_sample = 1 if doc_num < 15 | doc_num > 100000000000
//Last five characters are all 0s, all 1s or all 9s
replace out_sample = 1 if inlist(substr(docnum, -5, 5), "00000", "11111", "99999")
//First five characters are all 0s, all 1s or all 9s
replace out_sample = 1 if inlist(substr(docnum, 1, 5), "00000", "11111", "99999")

//Cedula de Ciudadania
replace out_sample = 1 if doc_tipo == "C" & doc_num < 1000000 //Less than 1M
//2,000,000,000 or more; this bound also covers every number with more than 10 digits
replace out_sample = 1 if doc_tipo == "C" & doc_num >= 2000000000

//Tarjeta de Identidad
replace out_sample = 1 if doc_tipo == "T" & doc_num < 1000000000 //Less than 10 digits
replace out_sample = 1 if doc_tipo == "T" & doc_num >= 100000000000 //More than 11 digits

//11-character Tarjeta numbers: characters 3-4 must lie in 1..12 and characters
//5-6 in 1..31 (presumably an embedded month and day -- confirm against the ID spec).
//real() returns missing for non-numeric text, and !inrange() flags missing values,
//so malformed pieces are flagged instead of aborting the run with a type mismatch.
replace out_sample = 1 if doc_tipo == "T" & length(docnum) == 11 & !inrange(real(substr(docnum, 3, 2)), 1, 12)
replace out_sample = 1 if doc_tipo == "T" & length(docnum) == 11 & !inrange(real(substr(docnum, 5, 2)), 1, 31)

//Duplicates: dup counts the OTHER rows sharing the same key (0 = unique)

//Same document appearing in 5 or more rows
duplicates tag doc_tipo doc_num, gen(dup)
replace out_sample = 1 if dup > 3
drop dup

//Same document, institution and first semester appearing more than once
duplicates tag doc_tipo doc_num ies prim_sem, gen(dup)
replace out_sample = 1 if dup > 0
drop dup

//Same document and institution appearing in 3 or more rows
duplicates tag doc_tipo doc_num ies, gen(dup)
replace out_sample = 1 if dup > 1
drop dup

//Check before dropping
drop if out_sample == 1 //11,707 (1.8% of data)
drop out_sample docnum

//Only document types "C" and "T" are retained (this also removes empty doc_tipo)
keep if inlist(doc_tipo, "C", "T")

//Within each document, keep the row(s) with the smallest periodo.
//Sorting on periodo inside each group puts the minimum first (missing sorts last).
bysort doc_tipo doc_num (periodo): keep if periodo == periodo[1]

//Sample Restrictions
//Saber 11 Between 2000 to 2002 (periodo_icfes values 20 through 25)
keep if inrange(periodo_icfes, 20, 25)
//Discard rows whose score is system missing
drop if puntaje_icfes == .

//Institutions need at least 20 remaining rows; _N is the row count within each ies
bysort ies: keep if _N >= 20

//One row per institution: mean score plus 75th, 25th and 50th percentiles
rename puntaje_icfes icfes
collapse (mean) icfes (p75) i75=icfes (p25) i25=icfes (p50) i50=icfes, by(ies)

//Plot, against each institution's mean score, a spike from the 25th to the
//75th percentile and a dot at the median
twoway (rspike i75 i25 icfes, lcolor(red*.5))                      ///
       (scatter i50 icfes, mcolor(black) msize(vsmall)),           ///
       scheme(s1color) plotregion(style(none))                     ///
       ylabel(0(25)100, angle(0) format(%4.0f) labsize(large))     ///
       yscale(r(0 100) titlegap(0.5))                              ///
       xlabel(0(25)100, format(%4.0f) labsize(large))              ///
       xscale(r(0 100) titlegap(2))                                ///
       legend(off) graphregion(margin(medlarge))                   ///
       xtitle("College reputation",size(large))                    ///
       ytitle("Icfes median and 25-75 range",size(large))

//Order institutions from highest to lowest mean score
gsort -icfes

//Final names: institution code and its reputation measure (the mean score)
rename (ies icfes) (cllg_code cllg_reputation)

//Keep only the two renamed variables and write the final file
keep cllg_code cllg_reputation
save "Data/Finals/CollegeReputation.dta", replace 
